
# ==============================================================================
# DESCRIPTION ----
# ==============================================================================

# This code combines all financial data from Nikkei NEEDS into a single time 
# series dataset

# NOTE: NIKKEI NEEDS DATA CANNOT BE PROVIDED FOR LEGAL REASONS.

# ==============================================================================
# LIBRARIES AND IMPORT ----
# ==============================================================================

library(tidyverse)
library(janitor)
library(lubridate)

# ==============================================================================
# INDUSTRIAL: READ IN ALL DATA FILES, CLEAN, AND APPEND INTO ONE DATAFRAME ----
# ==============================================================================

#### Create list of all data files ####
fq <- list.files("data/nikkei_needs/financial_information", full.names = T)

####  Loop through list, import data, and clean data #### 
fq_test <- readxl::read_excel("data/nikkei_needs/financial_information/N0000001-N0000099.xlsx", sheet = "Sheet1", skip = 1)

fq_test_clean <- fq_test %>%
  slice(-1) %>% slice(-1) %>% slice(-1) %>%
  clean_names() %>%
  rename(company = x1, date = x2, nikkei_code = nikkei_company_code)

# Create empty list to store dataframes
fq_df <- list()

# Begin to loop through all dataframes in directory
for (i in 1:length(fq)){
  fq_df[[i]] <- readxl::read_excel(fq[i], sheet = "Sheet1", skip = 1)
  
  #### Clean data ####
  fq_df[[i]] <- fq_df[[i]] %>%
    
    # Remove blank first row of data
    slice(-1) %>% slice(-1) %>% slice(-1) %>%
    
    # Standardize column names
    clean_names() %>%
    
    # Rename variables
    rename(company = x1, date = x2, nikkei_code = nikkei_company_code) %>%
    
    # Extract year and month from date variable
    mutate( 
      year = as.numeric(substr(date, start = 1, stop = 4)),
      month = as.numeric(substr(date, start = 6, stop = 7))
    ) %>%
    
    # Keep needed variables only
    select(-date, -nikkei_industory_code, -fiscal_year_end)
  
}

####  Append all cleaned data into one large data frame #### 
nikkei_financials <- bind_rows(fq_df)
rm(fq_df)

#### Remove companies or columns with no data ###
nikkei_financials <- nikkei_financials %>%
  relocate(year, month, .after = nikkei_code) %>%
  filter_at(vars(6:33), any_vars(!is.na(.)))

# ==============================================================================
# COMMERCIAL BANKS: READ IN ALL DATA FILES, CLEAN, AND ADD TO DATAFRAME ----
# ==============================================================================

# Import commercial bank data
banks1 <- readxl::read_excel("data/nikkei_needs/banks/banks_1.xlsx", sheet = "Sheet1", skip = 1)
banks2 <- readxl::read_excel("data/nikkei_needs/banks/banks_2.xlsx", sheet = "Sheet1", skip = 1)
banks3 <- readxl::read_excel("data/nikkei_needs/banks/banks_3.xlsx", sheet = "Sheet1", skip = 1)

# Combine bank data
banks <- rbind(banks1, banks2, banks3)
rm(banks1, banks2, banks3)

# Clean bank data: same operations as other corporations above
banks <- banks %>%
  slice(-1) %>% slice(-1) %>% slice(-1) %>%
  clean_names() %>%
  rename(company = x1, date = x2, nikkei_code = nikkei_company_code) %>%
  mutate( 
    year = as.numeric(substr(date, start = 1, stop = 4)),
    month = as.numeric(substr(date, start = 6, stop = 7))
  ) %>%
  mutate_if(is.character, na_if, c('-')) %>%
  filter(!is.na(company)) %>% # Removes Nikkei Excel labels from  
  remove_empty(which = "cols") %>% # drops columns that are all NA
  select(-date, -fiscal_year_end) %>%
  rename(       # Match banking variable names with industrial corps
    total_liabilities = liabilities
    )

# Combine with other corporation financial data
nikkei_financials <- bind_rows(nikkei_financials, banks)

# ==============================================================================
# REGIONAL BANKS: READ IN ALL DATA FILES, CLEAN, AND ADD TO DATAFRAME ----
# ==============================================================================

# Import regional bank data
shinkin1 <- readxl::read_excel("data/nikkei_needs/shinkin/shinkin1.xlsx", sheet = "Sheet1", skip = 1)
shinkin2 <- readxl::read_excel("data/nikkei_needs/shinkin/shinkin2.xlsx", sheet = "Sheet1", skip = 1)
shinkin3 <- readxl::read_excel("data/nikkei_needs/shinkin/shinkin3.xlsx", sheet = "Sheet1", skip = 1)
shinkin4 <- readxl::read_excel("data/nikkei_needs/shinkin/shinkin4.xlsx", sheet = "Sheet1", skip = 1)

# Combine bank data
shinkin <- rbind(shinkin1, shinkin2, shinkin3, shinkin4)
rm(shinkin1, shinkin2, shinkin3, shinkin4)

# Clean bank data: same operations as other corporations above
shinkin <- shinkin %>%
  slice(-1) %>% slice(-1) %>% slice(-1) %>%
  clean_names() %>%
  rename(company = x1, date = x2, nikkei_code = nikkei_company_code) %>%
  mutate( 
    year = as.numeric(substr(date, start = 1, stop = 4)),
    month = as.numeric(substr(date, start = 6, stop = 7))
  ) %>%
  mutate_if(is.character, na_if, c('-')) %>%
  filter(!is.na(company)) %>% # Removes Nikkei Excel labels from  
  remove_empty(which = "cols") %>% # drops columns that are all NA
  select(-date, -fiscal_year_end)

# Combine with other corporation financial data
nikkei_financials <- bind_rows(nikkei_financials, shinkin)

# ==============================================================================
# INSURANCE: READ IN ALL DATA FILES, CLEAN, AND ADD TO DATAFRAME ----
# ==============================================================================

# Import insurance data
insurance1 <- readxl::read_excel("data/nikkei_needs/insurance/insurance_1.xlsx", sheet = "Sheet1", skip = 1)
insurance2 <- readxl::read_excel("data/nikkei_needs/insurance/insurance_2.xlsx", sheet = "Sheet1", skip = 1)

# Combine insurance data
insurance <- rbind(insurance1, insurance2)
rm(insurance1, insurance2)

# Clean insurance data: same operations as other corporations above
insurance <- insurance %>%
  slice(-1) %>% slice(-1) %>% slice(-1) %>%
  clean_names() %>%
  rename(company = x1, date = x2, nikkei_code = nikkei_company_code) %>%
  mutate( 
    year = as.numeric(substr(date, start = 1, stop = 4)),
    month = as.numeric(substr(date, start = 6, stop = 7))
  ) %>%
  mutate_if(is.character, na_if, c('-')) %>%
  select(-date, -fiscal_year_end) %>%
  filter(!is.na(company)) %>%
  remove_empty(which = "cols") %>% # drops columns that are all NA
  rename(
    total_liabilities = liabilities,
    gross_profit_cumulative_total = core_profit)

# Combine with other corporation financial data
nikkei_financials <- bind_rows(nikkei_financials, insurance)

# ==============================================================================
# SECURITIES: READ IN ALL DATA FILES, CLEAN, AND ADD TO DATAFRAME ----
# ==============================================================================

# Import insurance data
securities1 <- readxl::read_excel("data/nikkei_needs/securities/securities_1.xlsx", sheet = "Sheet1", skip = 1)
securities2 <- readxl::read_excel("data/nikkei_needs/securities/securities_2.xlsx", sheet = "Sheet1", skip = 1)
securities3 <- readxl::read_excel("data/nikkei_needs/securities/securities_3.xlsx", sheet = "Sheet1", skip = 1)

# Combine insurance data
securities <- rbind(securities1, securities2, securities3)
rm(securities1, securities2, securities3)

# Clean insurance data: same operations as other corporations above
securities <- securities %>%
  slice(-1) %>% slice(-1) %>% slice(-1) %>%
  clean_names() %>%
  rename(company = x1, date = x2, nikkei_code = nikkei_company_code) %>%
  mutate( 
    year = as.numeric(substr(date, start = 1, stop = 4)),
    month = as.numeric(substr(date, start = 6, stop = 7))
  ) %>%
  mutate_if(is.character, na_if, c('-')) %>%
  select(-date, -fiscal_year_end) %>%
  filter(!is.na(company)) %>%
  remove_empty(which = "cols") %>%
  rename(
    total_liabilities = liabilities,
    ebitda_a = net_income_before_income_taxes_cumulative_total) # Closest

# Combine with other corporation financial data
nikkei_financials <- bind_rows(nikkei_financials, securities)

# ==============================================================================
# INSURANCE EXPENSES: READ IN ALL DATA FILES, CLEAN, AND ADD TO DATAFRAME ----
# ==============================================================================

# Import data on insurance company expenditures (note: does not exist for other industries)
expenses_1 <- readxl::read_excel("data/nikkei_needs/expenses/expenses_insurance_1.xlsx", sheet = "Sheet1", skip = 1)
expenses_2 <- readxl::read_excel("data/nikkei_needs/expenses/expenses_insurance_2.xlsx", sheet = "Sheet1", skip = 1)

# Combine expenses data
expenses <- rbind(expenses_1, expenses_2)
rm(expenses_1, expenses_2)

# Clean expenses data: same operations as other corporations above
expenses <- expenses %>%
  slice(-1) %>% slice(-1) %>% slice(-1) %>%
  clean_names() %>%
  rename(company = x1, date = x2) %>%
  mutate( 
    year = as.numeric(substr(date, start = 1, stop = 4)),
    month = as.numeric(substr(date, start = 6, stop = 7))
  ) %>%
  mutate_if(is.character, na_if, c('-')) %>%
  select(-date) %>%
  filter(!is.na(company)) %>%
  remove_empty(which = "cols")

# Save expenses data
save(expenses, file = "data/nikkei_needs/expenses/insurance_expenses.RData")

# ==============================================================================
# PRELIMINARY CLEANING ----
# ==============================================================================

# Remove duplicates #
# Duplicates exist where multiple totals of months in a fiscal year are 
# reported. The code below keeps values where months in fiscal year = 12. 
nikkei_financials <- nikkei_financials %>%
  arrange(nikkei_code, year, no_of_months_in_fiscal_period) %>%
  distinct(nikkei_code, year, .keep_all = TRUE)

# Replace hyphen with NA
nikkei_financials <- nikkei_financials %>%
  mutate_if(is.character, na_if, c('-'))

# Convert relevant columns to numeric
nikkei_financials <- nikkei_financials %>% mutate_at(c(9:155), as.numeric)

# ==============================================================================
# MERGE FINANCIAL WITH COPRORATE ATTRIBUTE DATA ----
# ==============================================================================

# Import nikkei corporate data
nikkei_attributes <- read_csv("data/nikkei_needs/attributes/nikkei_company_details.csv")

# Re-label securities, futures, and financing as single "finance" industry
nikkei_attributes <- nikkei_attributes %>%
  mutate(industry = case_when(
      industry == "Securities & Commodity Futures" |
      industry == "Other Financing Business" ~ 
        "Finance",
      industry == "Nonclassifiable" & industry_detail == "Credit & Leasing" ~ 
        "Finance",
      TRUE ~ industry
    ))

save(nikkei_attributes, file = "data/nikkei_needs/attributes/nikkei_attributes.RData")
  
# Expand attribute data to time series format
nikkei_time <- nikkei_attributes %>% 
  slice(rep(1:n(), each = 21)) %>% 
  mutate(year = rep(2000:2020, times = nrow(nikkei_attributes)))

# Add leading N to financials
nikkei_financials <- nikkei_financials %>%
  mutate(nikkei_code = str_pad(nikkei_code, width = 8, pad = "N"))

# Perform merge
nikkei_time <- left_join(nikkei_time, nikkei_financials, 
                               by = c("nikkei_code", "year"))

# ==============================================================================
# MERGE LOAN DATA WITH AMAKUDARI DATA BY NIKKEI CODE ----
# ==============================================================================

# Import amakudari data
amakudari <- read_csv("data/amakudata.csv")

# Keep necessary amakudari variables only
amakudari_reduced <- amakudari %>% 
  mutate(year = year(date_ret)) %>% 
  group_by(year, nikkei_code) %>%
  summarise(amakudari_n = n()) %>%
  select(year, nikkei_code, amakudari_n) %>%
  mutate(amakudari_binary = "1") %>%
  filter(!is.na(nikkei_code))

# Repeat above but by ministry
amakudari_ministry <- amakudari %>%
  mutate(year = year(date_ret)) %>% 
  group_by(year, nikkei_code, ministry_short) %>%
  summarise(amakudari_n = n()) %>%
  select(year, nikkei_code, amakudari_n, ministry_short) %>%
  mutate(amakudari_binary = "1") %>%
  filter(!is.na(nikkei_code)) %>%
  mutate(ministry_short = ifelse(ministry_short == -99, "Other", ministry_short)) %>%
  # Convert by-ministry summary to wide format for merging
  pivot_wider(
    names_from = ministry_short,
    values_from = c(amakudari_n, amakudari_binary)
  ) %>%
  # Convert NA values to zero
  mutate(across(amakudari_n_Other:amakudari_binary_MOFA, as.numeric)) %>%
  replace(is.na(.), 0)

# Merge loan data with amakudari data
nikkei_time <- left_join(nikkei_time, amakudari_reduced, 
                         by = c("year", "nikkei_code"))

nikkei_time <- left_join(nikkei_time, amakudari_ministry, 
                         by = c("year", "nikkei_code"))

# Convert NA amakudari (i.e., no amakudari) values to 0
nikkei_time <- nikkei_time %>% 
  mutate(
    across(starts_with("amakudari"), as.numeric),
    across(starts_with("amakudari"), ~replace_na(.,0))
    )

shinkin <- nikkei_time %>% filter(str_detect(nikkei_code, "N0026976"))

# ==============================================================================
# FINALIZE DATA AND EXPORT ----
# ==============================================================================

# Shorten column names
nikkei_time <- nikkei_time %>%
  rename(
    months_in_fiscal_period = no_of_months_in_fiscal_period,
    parent_nikkei_code = parent_company_nikkei_company_code,
    parent_stock_code = parent_company_stock_code,
    sales_and_operating_revenue = sales_and_operating_revenue_tanshin_summary_cumulative_total,
    employees = number_of_employees_at_the_end_of_period,
    temp_employees = average_number_of_temporary_employee,
    roi = return_on_investment_roi,
    variable_cost_total = variable_cost_variable_cost_total,
    fixed_cost_total = fixed_cost_fixed_cost_total,
    leverage = financial_leverage,
    gross_profit = gross_profit_cumulative_total
    )
  
# Combine columns where two separate exist in Nikkei NEEDS
nikkei_time <- nikkei_time %>%
  mutate(
    operating_revenue = coalesce(sales_and_operating_revenue, 
                                 total_net_sales_and_operating_revenue),
    ebitda = coalesce(ebitda_a, ebitda_b),
    roe = coalesce(return_on_equity_a, roe_b) 
  ) %>%
  select(-sales_and_operating_revenue, -total_net_sales_and_operating_revenue, 
         -ebitda_a, -ebitda_b, -return_on_equity_a, -roe_b)

# Remove columns specific only to banks, insurance, and securites
nikkei_time <- nikkei_time %>%
  select(
    nikkei_code, tse_code, firm_dest_en, firm_dest_merge, firm_dest_kana,
    parent_nikkei_code, parent_stock_code, city, industry, industry_detail,
    date_founded, year, amakudari_binary, amakudari_n, 
    total_assets, total_liabilities, operating_revenue, 
    gross_profit, roi, ebitda, roe, leverage, reserve_ratio, 
    variable_cost_total, fixed_cost_total, employees, temp_employees,
    starts_with("amakudari")
   )

####  Export #### 
save(nikkei_time, file = "data/nikkei_financials.RData")
